#install.packages("Matrix")
library(Matrix)
library(pdftools) # Reads pdf documents into text strings
library(tm) # Text cleaning for large corpa similar to tidytext it can help with cleaning and tokenizing
library(quanteda) # Text cleaning for large corpa similar to tidytext tokenizing
library(tidytext) # For analysis of text in a tidy manner including sentiment data
library(textstem) # For stemming and lemmatizing text
library(gutenbergr) # Project Gutenberg books
library(wordcloud) # For world cloud

library(lsa) # For latent semantic analysis
library(stm) # For structural topic modeling
library(uwot) # For umap dimensionality reduction
library(text2vec) # For cosine similarity 
library(kernlab) # For kernel-based cannonical correlation analysis
library(rPref) # For pareto frontier\
library(DT) # For interactive data tables
library(textdata) # Database of lexicon and embeddings

library(knitr)
library(ggrepel)
library(caret) # For predictive model fitting
library(tidyverse)
library(patchwork)

set.seed(888)
# NOTE(review): rm(list = ls()) wipes the user's workspace and is discouraged
# in scripts; kept to preserve the original script's behavior.
rm(list = ls())

## Read files
# https://serialmentor.com/blog/2016/6/13/reading-and-combining-many-tidy-data-files-in-R
# Assumes the PDF documents are in `data_path`; each document is read into one
# row of the data frame as a character vector (one element per page).

data_path <- "/Users/nitaykenig/Downloads/Readings ISyE 562"  # folder that contains the papers
# Anchored regex: match only names ending in ".pdf". The original "*.pdf"
# pattern is not a valid glob here -- dir() interprets `pattern` as a regex.
files <- dir(data_path, pattern = "\\.pdf$")

# tibble() replaces the deprecated data_frame()
docs.df <- tibble(document = files) %>%
  mutate(text = map(document, ~ pdf_text(file.path(data_path, .))))  # read each PDF into text


## Create a clean name and split the file name into file number and title
# Strip only a trailing ".pdf". The original gsub('.pdf', '', ...) left the
# dot as a regex wildcard, so it could also delete e.g. "Xpdf" mid-name.
docs.df$document <- sub("\\.pdf$", "", docs.df$document)

# Split "<number>. <title>" at the FIRST dot; extra = "merge" keeps any later
# dots (e.g. "et al.") inside the title instead of silently dropping the rest.
docs.df <- docs.df %>%
  separate(document, into = c("number", "title"),
           sep = "\\.", extra = "merge", remove = FALSE)

rm(files, data_path)
## Clean data with search-and-replace statements
# `text` is a list-column (one character vector per document). Calling gsub()
# directly on a list-column coerces it by DEPARSING, which injects artifacts
# such as 'c("' and escaped quotes into the text. Flatten each document to a
# single string first, then clean the real text.
docs.df$text <- map_chr(docs.df$text, paste, collapse = " ")
docs.df$text <- gsub("[0-9]+", "", docs.df$text)  # remove digit runs
docs.df$text <- gsub("[.]+", "", docs.df$text)    # remove periods
# Remove spurious non-words as WHOLE words only. The original removed them as
# substrings, which would also mangle real words (e.g. "figure" -> "ure").
docs.df$text <- gsub("\\b(doi|fig|zij|smm)\\b", "", docs.df$text)

### The removed strings are extraction artifacts with no relevance to the clustering required for the assignment.

  
## Tokenize into one row per word, lower-cased, with punctuation stripped,
## then drop uninformative tokens in a single pipeline.
text.df <- docs.df %>%
  unnest_tokens(term, text, token = "words",
                to_lower = TRUE,
                strip_punct = TRUE) %>%
  # Remove one- and two-letter words, and very long strings that are
  # spurious artifacts of the PDF reader
  filter(str_length(term) > 2,
         str_length(term) < 15) %>%
  # Remove stopwords
  anti_join(get_stopwords(), by = c("term" = "word"))

## Stem and lemmatize using the textstem package
# Helpful when the sample is relatively small and important aspects could be
# lost because inflected variants (e.g. "compute" vs "computed") each occur
# infrequently.

# text.df$term = stem_words(text.df$term) # Stems may not be real words ("computational" >> "comput");
# stem completion can map stems back to the most frequent original form.

text.df$term <- lemmatize_words(text.df$term)  # Like stemming but returns real words; takes longer


## Plot how many words remain in each document after processing
text.df %>%
  count(document) %>%
  ggplot(aes(n, reorder(document, -n))) +
  geom_col() +
  labs(x = "Total words in document", y = "")

## Term frequency and tf-idf per document/term pair, with low-signal terms
## filtered out in the same pipeline
tfidf.text.df <- text.df %>%
  count(document, term) %>%
  bind_tf_idf(term, document, n) %>%
  filter(n > 8,             # drop infrequent words
         tf_idf > .000001)  # drop indiscriminate words (near-zero tf-idf)

## Word cloud of the filtered terms, sized by raw count n.
# min.freq = 25: only terms occurring at least 25 times are drawn; terms that
# cannot be fit on the page produce warnings and are skipped (see below).
wordcloud(tfidf.text.df$term, tfidf.text.df$n, min.freq = 25)
## Warning in wordcloud(tfidf.text.df$term, tfidf.text.df$n, min.freq = 25):
## explanation could not be fit on page. It will not be plotted.
## Warning in wordcloud(tfidf.text.df$term, tfidf.text.df$n, min.freq = 25): datum
## could not be fit on page. It will not be plotted.

## Most discriminating terms: top 10 tf-idf terms per document
# slice_max() keeps ties by default, matching top_n() behavior
top10.df <- tfidf.text.df %>%
  group_by(document) %>%
  slice_max(tf_idf, n = 10) %>%
  ungroup() %>%
  mutate(document = as.factor(document))

ggplot(top10.df,
       aes(reorder_within(term, tf_idf, within = document), tf_idf)) +
  geom_col() +
  coord_flip() +
  facet_wrap(. ~ document, scales = "free") +
  scale_x_reordered()

## Scatterplot of term frequency vs inverse document frequency, per document
ggplot(top10.df, aes(idf, tf, size = tf_idf)) +
  geom_point(shape = 21, size = .75) +
  geom_text_repel(aes(label = term, size = tf_idf)) +
  facet_wrap(. ~ document) +
  theme_bw() +
  theme(legend.position = "none")
# (ggrepel warns about unlabeled points where labels overlap too much;
#  increase max.overlaps to label more of them)

## A single plot of the top tf-idf terms across all documents
ggplot(top10.df, aes(idf, tf, size = tf_idf)) +
  geom_point(shape = 21, size = 1) +
  geom_text_repel(aes(label = term, size = tf_idf)) +
  theme_bw() +
  coord_trans(y = "log") +
  theme(legend.position = "none")
# (ggrepel may again warn about unlabeled overlapping points)

rm(top10.df)

### There does seem to be a relationship between the week of the articles and their clusters; however, it is not strong, so better clustering methods may well exist.

# Cast the tidy table into term-by-document matrices: one tf-idf weighted,
# one with raw counts
tdm_weighted.tdmat <- cast_tdm(tfidf.text.df, term, document, tf_idf)
tdm_count.tdmat <- cast_tdm(tfidf.text.df, term, document, n)

# dimcalc_share keeps the dimensions that retain the required share (here 75%)
# of the total variance
lsa_model <- lsa(tdm_count.tdmat, dims = dimcalc_share(share = .75))

## Dimensions of the LSA space
# The number of singular values is bounded by the number of documents
dim(lsa_model$tk)     # terms x LSA space
## [1] 429   6
dim(lsa_model$dk)     # documents x LSA space
## [1] 11  6
length(lsa_model$sk)  # singular values
## [1] 6
## Reconstruct the term-document matrix from the reduced LSA space, i.e. the
## expected word frequency of each term in each document (output shown below)
as.textmatrix(lsa_model)
## $matrix
##                        D1     D2    D3    D4    D5    D6    D7    D8    D9
## 1. contextual       21.48  -0.40  2.92 -0.02  0.28  1.54  0.18 -0.07  0.01
## 2. customer         55.65  -1.03  7.56 -0.05  0.72  3.98  0.46 -0.19  0.02
## 3. datum            18.30 156.56  3.37 46.13 22.61 37.79 14.03 37.28 10.48
## 4. design           95.54   9.66 14.05 -0.59  2.21  9.38  2.15 -1.02 -0.06
## 5. designer         13.57  -0.31  2.49  0.00  0.11  1.07  0.45 -0.05 -0.01
## 6. detail           12.83   0.74  1.73  1.70  0.54  1.12  0.20 -0.19 -0.03
## 7. develop           9.74  -0.50  1.06 22.41  3.70  0.70  0.32  0.00  0.01
## 8. diagram          11.72  -0.22  1.59 -0.01  0.15  0.84  0.10 -0.04  0.00
## 9. environment      12.04   2.64  1.70  0.40  0.49  1.55  0.40  0.82  1.07
## 10. february        10.74  -0.20  1.46 -0.01  0.14  0.77  0.09 -0.04  0.00
## 11. focus           13.67  -0.25  1.86 -0.01  0.18  0.98  0.11 -0.05  0.00
## 12. get              9.76  -0.18  1.33 -0.01  0.13  0.70  0.08 -0.03  0.00
## 214. counterfactual  0.98   2.74  0.16  0.00  0.27  0.74  0.25  1.03 -0.06
## 215. effect          1.61   4.50  0.26  0.00  0.45  1.21  0.41  1.69 -0.10
## 216. encode          0.77   2.15  0.12  0.00  0.21  0.58  0.19  0.81 -0.05
## 217. engine          0.63   1.76  0.10  0.00  0.17  0.48  0.16  0.66 -0.04
## 218. estimate        0.91   2.54  0.15  0.00  0.25  0.69  0.23  0.95 -0.06
## 219. figure          0.54   1.51  0.38  0.05 -0.20  2.08  1.45 21.58 17.92
## 220. fit             0.63   1.76  0.10  0.00  0.17  0.48  0.16  0.66 -0.04
## 221. form            0.70   1.96  0.11  0.00  0.19  0.53  0.18  0.73 -0.04
## 222. graphical       1.05   2.93  0.17  0.00  0.29  0.79  0.27  1.10 -0.07
## 223. hierarchy       0.91   2.54  0.15  0.00  0.25  0.69  0.23  0.95 -0.06
## 224. inference       1.50   4.77  0.26  0.13  0.49  1.27  0.45  1.73  0.23
## 225. level           2.24   6.26  0.36  0.00  0.62  1.69  0.57  2.35 -0.14
## 418. motivator      -0.15  -0.09  0.97  0.02 -0.10  0.14  0.52 -0.01 -0.01
## 419. nbehavior      -0.10  -0.06  0.63  0.01 -0.06  0.09  0.34 -0.01 -0.01
## 420. occur          -0.07  -0.04  0.44  0.01 -0.04  0.06  0.23  0.00 -0.01
## 421. pain           -0.07  -0.04  0.44  0.01 -0.04  0.06  0.23  0.00 -0.01
## 422. perform        -0.18  -0.11  1.16  0.03 -0.12  0.17  0.62 -0.01 -0.02
## 423. person         -0.07  -0.05  0.48  0.01 -0.05  0.07  0.26  0.00 -0.01
## 424. persuasive     -0.20  -0.13  1.36  0.03 -0.14  0.20  0.73 -0.01 -0.02
## 425. simplicity     -0.20  -0.12  1.31  0.03 -0.13  0.19  0.70 -0.01 -0.02
## 426. spark          -0.08  -0.05  0.53  0.01 -0.05  0.08  0.29 -0.01 -0.01
## 427. target         -0.25  -0.15  1.65  0.04 -0.17  0.24  0.88 -0.02 -0.03
## 428. trigger        -0.53  -0.33  3.53  0.08 -0.36  0.51  1.89 -0.03 -0.05
## 429. type           -0.12  -0.08  0.82  0.02 -0.08  0.12  0.44 -0.01 -0.01
##                       D10   D11
## 1. contextual       -0.06 -0.16
## 2. customer         -0.16 -0.42
## 3. datum            19.58 -0.55
## 4. design            1.33 17.70
## 5. designer          0.05 12.86
## 6. detail            0.06 -0.15
## 7. develop          -0.42 -0.05
## 8. diagram          -0.03 -0.09
## 9. environment       0.35  0.95
## 10. february        -0.03 -0.08
## 11. focus           -0.04 -0.10
## 12. get             -0.03 -0.07
## 214. counterfactual  0.35  0.10
## 215. effect          0.58  0.16
## 216. encode          0.28  0.08
## 217. engine          0.23  0.06
## 218. estimate        0.33  0.09
## 219. figure          0.64 13.00
## 220. fit             0.23  0.06
## 221. form            0.25  0.07
## 222. graphical       0.38  0.10
## 223. hierarchy       0.33  0.09
## 224. inference       0.62  0.45
## 225. level           0.81  0.22
## 418. motivator       0.13 19.94
## 419. nbehavior       0.09 12.96
## 420. occur           0.06  8.97
## 421. pain            0.06  8.97
## 422. perform         0.16 23.92
## 423. person          0.07  9.97
## 424. persuasive      0.19 27.91
## 425. simplicity      0.18 26.91
## 426. spark           0.07 10.96
## 427. target          0.23 33.89
## 428. trigger         0.49 72.76
## 429. type            0.11 16.95
## 
## $legend
##  [1] "D1 = 1. Beyer, Holtzblatt_1999_Contextual Design"                                                               
##  [2] "D2 = 1. Olhede, Wolfe_The growing ubiquity of algorithms in society implications , impacts and innovations_2018"
##  [3] "D3 = 2. HFE Design and Systems thinking"                                                                        
##  [4] "D4 = 3. Parker_2017_Opinionated analysis development"                                                           
##  [5] "D5 = 3. Sandve et al._2013_Ten simple rules for reproducible computational research"                            
##  [6] "D6 = 4. Pearl_2019_The seven tools of causal inference, with reflections on machine learning copy"              
##  [7] "D7 = 5. Hogarth, Lejarraga, Soyer_2015_The Two Settings of Kind and Wicked Learning Environments"               
##  [8] "D8 = 6. Zhang, Goncalves_Why should I trust you Explaining the predictions of any classifier_2015"              
##  [9] "D9 = 7. Bolukbasi et al._Man is to computer programmer as woman is to homemaker debiasing word embeddings_2016" 
## [10] "D10 = 7. Courtland_The bias detectives_2018"                                                                    
## [11] "D11 = 8. Fogg_2009_A behavior model for persuasive design"
## Refit LSA, this time on the tf-idf weighted term-document matrix
lsa_model <- lsa(tdm_weighted.tdmat, dims = dimcalc_share(share = .75))

rm(tdm_count.tdmat)

## Cosine similarity between documents (equals 1 for identical documents);
## the dk component holds the document coordinates in LSA space
doc.similiarity.mat <- cosine(t(lsa_model$dk))

## Calculates the mean tf_idf of each term and selects top 70 
# NOTE(review): the cbind() assumes the rows of lsa_model$tk are in exactly the
# same term order as the group_by/summarise output (alphabetical) -- TODO
# confirm cast_tdm/lsa preserve that order, otherwise rows are misaligned.
temp = tfidf.text.df %>% 
  group_by(term) %>% 
  summarise(m.tf_idf = mean(tf_idf)) %>% 
  cbind(lsa_model$tk) %>% top_n(70, m.tf_idf) 

# Cosine similarity of terms, using only the LSA coordinate columns
row.names(temp)= temp$term
term.similiarity.mat = cosine(t(temp %>% select(-term, -m.tf_idf))) 

# Convert similarities to dissimilarity distance objects for clustering
doc.dissimilarity.dist = as.dist(1-doc.similiarity.mat)
term.dissimilarity.dist = as.dist(1-term.similiarity.mat)

## Hierarchical clustering of documents and of terms
# Setting method = ward.D2 corresponds to agnes clustering
doc.cluster = hclust(doc.dissimilarity.dist, method = "ward.D2", members = NULL)
plot(doc.cluster)

term.cluster = hclust(term.dissimilarity.dist, method = "ward.D2", members = NULL)
plot(term.cluster)

rm(temp)
## UMAP embedding of the terms in LSA space
# as_tibble() replaces the deprecated as.tibble(); .name_repair gives the two
# unnamed UMAP columns unique names before they are renamed below.
term.umap <- umap(lsa_model$tk, n_neighbors = 20, n_components = 2) %>%
  as_tibble(.name_repair = "unique")
names(term.umap) <- c("umap_1", "umap_2")

## Add tf_idf information (max tf_idf per term, used to size the points)
# NOTE(review): cbind() assumes the UMAP rows (ordered like lsa_model$tk) match
# the alphabetical term order of the summarise() output -- TODO confirm.
term.umap = tfidf.text.df %>% group_by(term) %>% 
  summarise(m.tf_idf = max(tf_idf)) %>%
  cbind(term.umap) 

## Plot UMAP of terms; only the 175 highest-tf_idf terms get labels
term.umap.plot = 
  ggplot(term.umap, aes(umap_1, umap_2)) +
  geom_point(aes(size=m.tf_idf),shape = 21) +
  geom_label_repel(data = term.umap %>% top_n(175, m.tf_idf),
                   aes(label = term), alpha = .6, size = 3) +
  labs(title = "UMAP clustering", subtitle = "20 nearest neighbors") +
  theme_void()+
  theme(legend.position = "none")
term.umap.plot
## Warning: ggrepel: 158 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Plot UMAP of documents
# n_neighbors = 3 because there are only 11 documents.
# as_tibble() replaces the deprecated as.tibble().
doc.umap <-
  umap(lsa_model$dk, n_neighbors = 3, n_components = 2) %>%
  as_tibble(.name_repair = "unique")
names(doc.umap) <- c("umap_1", "umap_2")
# Assumes docs.df rows are in the same document order as lsa_model$dk -- TODO confirm
doc.umap <- cbind(docs.df, doc.umap)

doc.umap.plot <-
  ggplot(doc.umap, aes(umap_1, umap_2)) +
  geom_point(aes(colour = number), size = 2.5) +
  geom_label_repel(aes(label = document, fill = number), alpha = .6, size = 3) +
  labs(title = "UMAP clustering", subtitle = "3 nearest neighbors") +
  theme(legend.position = "none")
doc.umap.plot

ggsave(doc.umap.plot, filename = "doc.umap.pdf", width = 8, height = 8)

rm(term.umap, term.cluster, doc.umap.plot, doc.umap)
## Convert tidy text to a sparse matrix for stm analysis
text.sparse <- tfidf.text.df %>% cast_sparse(document, term, n)

## Fit structural topic models over a range of topic counts K
multi_stm.fit <- searchK(text.sparse,
                         K = 3:13,
                         M = 12,            # number of top words for the exclusivity calculation
                         init.type = "Spectral",
                         N = 4,             # documents partially held out (10% by default)
                         proportion = 0.5,  # share of held-out documents for the likelihood
                         heldout.seed = 888, cores = 4)
## Using multiple-cores.  Progress will not be shown.
## Plot topic-count diagnostics
# Higher held-out likelihood is better
# Lower residual dispersion is better
# Higher semantic coherence and exclusivity are better
# Higher lower bound of the marginal likelihood (evidence) is better
plot(multi_stm.fit)

## Pareto trade-off of exclusivity and coherence
# Coherence: high-probability terms of a topic co-occur within documents
# Exclusivity: high-probability terms of a topic are not high-probability terms of other topics
# Mimno, D., Wallach, H. M., Talley, E., & Leenders, M. (2011). Optimizing Semantic Coherence in Topic Models, (2), 262-272.

multifit.results <- multi_stm.fit$results

# Remove only the object that exists. The original rm() also listed
# LikelihoodBound.plot, CoherenceExclusivity.plot and sky, which were never
# created and produced "object not found" warnings.
rm(multifit.results)
## Convert tidy data to a sparse matrix (repeated so this chunk stands alone)
text.sparse <- tfidf.text.df %>% cast_sparse(document, term, n)

## Fit the structural topic model with K = 4 topics
# eta:   topic-word hyperparameter (defaults to .01)
# alpha: prevalence hyperparameter (defaults to 50/K)
topic_model <- stm(text.sparse, K = 4,
                   control = list(eta = .01, alpha = 50/8),
                   verbose = FALSE, init.type = "Spectral")

## Extract word-topic combinations and keep the 10 highest-beta terms per topic
td_beta <- tidy(topic_model) %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup()

## Add FREX-based labels
# FREX words distinguish topics because they are both FRequent and EXclusive.
# Call labelTopics() once; the original called it twice with different n
# (4 then 3), doing redundant work. topicnums does not depend on n.
# as_tibble() replaces the deprecated as.tibble().
topic_labels <- labelTopics(topic_model, n = 4)
frex <- topic_labels$frex %>%
  as_tibble(.name_repair = "unique") %>%
  unite(col = topic_name)
frex <- cbind(topic = topic_labels$topicnums, frex)
frex$topic_number_name <- paste(frex$topic, frex$topic_name, sep = "-")
td_beta <- left_join(td_beta, frex, by = "topic")  # explicit key avoids the join message

  
## Plot term proportions within each labeled topic
td_beta_ordered <- td_beta %>%
  mutate(term_ordered = reorder_within(term, beta, topic))

ggplot(td_beta_ordered, aes(term_ordered, beta)) +
  geom_col() +
  coord_flip() +
  facet_wrap(. ~ topic_number_name, scales = "free") +
  scale_x_reordered() +
  labs(title = "Prevalence of terms across topics",
       x = "term", y = "Term prevalence (beta)")

## Extract document-topic combinations (gamma = topic proportion per document)
td_gamma =
    tidy(topic_model, matrix = "gamma",
         document_names = rownames(text.sparse))

## Identify the dominant topic of each document
# NOTE(review): which.max(gamma) returns a ROW INDEX within each document
# group; it equals the topic number only if topics appear in order 1..K within
# each group -- TODO confirm tidy() guarantees that ordering.
td_gamma = 
    td_gamma %>% group_by(document) %>%
    mutate(dominant_topic = which.max(gamma))

# Attach the human-readable topic labels built alongside td_beta
topic_names = td_beta %>% select(topic, topic_number_name) %>% distinct()
td_gamma = left_join(td_gamma, topic_names , by = "topic")
  
## Plot prevalence of topics across documents, faceted by dominant topic
ggplot(td_gamma, aes(as.factor(topic), gamma, fill = as.factor(topic_number_name))) +
    geom_col(width = .8, position = position_dodge(width = .2, preserve = "single")) +
    facet_grid(reorder(interaction(dominant_topic, document), dominant_topic)~.,
               scales = "free_x", drop = TRUE) +
    labs(title = "Prevalence of topics across documents", subtitle = "Most documents have one topic",
         x = "Topic", y = "Topic prevalence (gamma)") +
    theme(legend.position = "bottom", 
            axis.text.y = element_blank(),
            axis.ticks = element_blank(),
             strip.text.y = element_text(angle =0, hjust = 0))

rm(td_beta, td_gamma, frex, topic_names)

### I chose 4 groups to obtain clearer clusters. Going from left to right among the cluster groups: on the left, indicated by pink, the cluster could reflect bias from embeddings; on the right it would be considered persuasive.

## Sentiment dictionaries from tidytext
get_sentiments("afinn") %>% datatable() # Numeric positive and negative sentiment
get_sentiments("bing") %>% datatable() # Categorical positive and negative
#get_sentiments("nrc")[1:1000, ] %>% datatable() # Multiple emotional categories--trust, fear, anger, joy, disgust...

## Calculate the total positive and negative sentiment per document
# Pipeline: count ALL terms per document first (so the denominator includes
# non-sentiment words), join the bing lexicon, count sentiment words per
# document/polarity, convert counts to proportions, give negative sentiment a
# negative sign, and sum to a net sentiment per document.
sentiment.df = text.df %>% group_by(document) %>% 
  mutate(total_terms = n()) %>% ungroup() %>% # Count terms first to identify the proportion of sentimental terms
  inner_join(get_sentiments("bing"), by=c("term" = "word")) %>%
  group_by(document, sentiment) %>% 
  summarise(n = n(), total_terms= first(total_terms)) %>% 
  ungroup() %>% group_by(document) %>% 
  mutate(proportion = n/total_terms) %>% group_by(document, sentiment) %>% 
  mutate(signed.proportion = replace(proportion, sentiment=="negative", -proportion)) %>% 
  ungroup() %>% 
  group_by(document) %>% 
  mutate(sum_sentiment = sum(signed.proportion))
## `summarise()` has grouped output by 'document'. You can override using the
## `.groups` argument.
## Plot per-document sentiment, ordered by net positive sentiment
ggplot(sentiment.df,
       aes(reorder(document, sum_sentiment), signed.proportion,
           fill = sentiment)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "Documents ordered by net proportion of positive sentiment",
       y = "Proportion of positive and negative words")

## Words that contribute most to each document's sentiment
# Note on `top_n(n, n = 5)`: the NAMED `n = 5` binds to the row-count argument,
# so the POSITIONAL `n` binds to `wt` (the count column). slice(1:5) then caps
# each group at 5 rows in case of ties.
influence_sentiment.df = text.df %>% 
  inner_join(get_sentiments("bing"), by=c("term" = "word")) %>% 
  count(document, sentiment, term) %>% 
  group_by(document, sentiment) %>% 
  top_n(n, n = 5) %>% slice(1:5) %>% # Limits to 5 in case of ties
  mutate(signed.sentiment = replace(n, sentiment=="negative", -n))  
  

## Plot the contributing words, negative counts drawn downward
ggplot(influence_sentiment.df, aes(reorder_within(term, n, document), signed.sentiment, fill = sentiment)) +
  geom_col() +
  coord_flip() +
  facet_wrap(.~document, scales = "free")+
  scale_x_reordered() +
  labs(x= "Words that strongly contribute to document sentiment", y = "Positive and negative sentiment") +
  theme(legend.position = "none")

### Initially I assumed "greedy" referred to the algorithm, but the lexicon treats it simply as a negative word. Looking at these graphs it makes sense to see words such as "bias" and "problem" scored as extremely negative, and words such as "support" and "trust" as extremely positive.

## To run this code you must first download the embeddings file from: https://nlp.stanford.edu/projects/glove/
# This is a large file (800MB) so it takes a bit of time and disk space
# NOTE(review): hard-coded absolute path -- adjust for your machine.


glove = read_delim(file = "/Users/nitaykenig/Downloads/glove.6B/glove.6B.100d.txt", 
                   progress =FALSE,
                   col_names = FALSE, delim = " ", quote = "")
names(glove)[1] = "token" # first column is the word; X2..X101 are the 100 embedding dimensions

## Extract the embedding vectors for man, son, and woman
man = glove %>% filter(token=="man") %>% select(-token)
son = glove%>%filter(token=="son") %>% select(-token)
woman = glove%>%filter(token=="woman") %>% select(-token)

## Calculate the woman analog of son via vector arithmetic
# daughter - woman = son - man  =>  daughter = son - man + woman
female_child = son-man+woman

## Convert the glove and female_child data frames into matrices
x = glove %>% select(-token) %>% as.matrix() # full embedding matrix, one row per token
y = as.matrix(female_child) # the 1 x 100 query vector

library(text2vec) # already attached at the top of the script; harmless reload

## Cosine similarity between "female_child" and every word embedding
cos_sim <- sim2(x, y, method = "cosine", norm = "l2")

## Identify and plot the most similar words
# Build the result directly as a tibble. The original cbind(glove$token,
# cos_sim) coerced the similarities to a character matrix and used the
# deprecated as.tibble(); constructing the tibble avoids both problems.
similar_words <- tibble(term = glove$token,
                        similarity = as.numeric(cos_sim)) %>%
  arrange(desc(similarity)) %>%
  dplyr::slice(1:15)

ggplot(similar_words, aes(reorder(term, similarity), similarity)) +
  geom_col() +
  coord_flip() +
  labs(title = "Words that are closest to: female_child = son-man+woman", 
       x = "Closest terms", y = "Similarity")

## As with the sentiment lexicon, word embeddings can be joined onto the terms
glovec.text.df = text.df %>% 
  inner_join(glove, by=c("term" = "token"))

## Document embeddings: average each embedding dimension over a document's terms
# pivot_longer()/pivot_wider() replace the superseded gather()/spread();
# .groups = "drop" replaces the trailing ungroup() and silences the message.
s.glovec.text.df <- glovec.text.df %>%
  pivot_longer(cols = contains("X"),
               names_to = "glovec_id", values_to = "glovalue") %>%
  group_by(document, glovec_id) %>%
  summarise(m.glovalue = mean(glovalue), .groups = "drop") %>%
  pivot_wider(names_from = glovec_id, values_from = m.glovalue)

## Document distance based on cosine similarity of the generic embeddings
# (cosine similarity is invariant to the column order of the dimensions)
doc.similiarity.mat <- cosine(t(s.glovec.text.df %>% select(contains("X")) %>% as.matrix()))
row.names(doc.similiarity.mat) <- as.vector(s.glovec.text.df$document)

doc.dissimilarity.dist <- as.dist(1 - doc.similiarity.mat)


## Hierarchical clustering of documents
# Setting method = ward.D2 corresponds to agnes clustering
doc.cluster <- hclust(doc.dissimilarity.dist, method = "ward.D2", members = NULL)
plot(doc.cluster)

## UMAP of documents (as_tibble() replaces the deprecated as.tibble())
doc.umap <- s.glovec.text.df %>%
  select(starts_with("X")) %>%
  umap(n_neighbors = 3, n_components = 2) %>%
  as_tibble(.name_repair = "unique")
names(doc.umap) <- c("umap_1", "umap_2")
# Assumes docs.df rows are in the same order as s.glovec.text.df -- TODO confirm
doc.umap <- cbind(docs.df, doc.umap)

doc.umap.plot <-
  ggplot(doc.umap, aes(umap_1, umap_2)) +
  geom_point(aes(colour = number), size = 2.5) +
  geom_label_repel(aes(label = document, fill = number), alpha = .6, size = 3) +
  labs(title = "UMAP clustering", subtitle = "3 nearest neighbors") +
  theme_void() +
  theme(legend.position = "none")
doc.umap.plot

## Match terms to the glove embedding
terms.df <- tfidf.text.df %>%
  group_by(term) %>%
  summarise(m.tf_idf = max(tf_idf)) %>%
  inner_join(glove, by = c("term" = "token"))

## Calculate UMAP dimensions (as_tibble() replaces the deprecated as.tibble())
term.umap <- terms.df %>%
  select(starts_with("X")) %>%
  umap(n_neighbors = 10, n_components = 2) %>%
  as_tibble(.name_repair = "unique")
names(term.umap) <- c("umap_1", "umap_2")

# Rows align because term.umap was computed from terms.df in row order
term.umap <- terms.df %>%
  cbind(term.umap)

## Plot UMAP of terms; only the 100 highest-tf_idf terms are labeled.
# Subtitle fixed to match n_neighbors = 10 (the original said 20).
term.umap.plot <-
  ggplot(term.umap, aes(umap_1, umap_2)) +
  geom_point(aes(size = m.tf_idf), shape = 21) +
  geom_label_repel(data = term.umap %>% top_n(100, m.tf_idf),
                   aes(label = term), alpha = .6, size = 3) +
  labs(title = "UMAP clustering", subtitle = "10 nearest neighbors") +
  theme_void() +
  theme(legend.position = "none")
term.umap.plot
# (ggrepel may warn about unlabeled overlapping points)

# Uses kernlab for kernel-based canonical correlation, combining the LSA and
# glove document representations

combined.embedding = cbind(s.glovec.text.df, lsa_model$dk)  # NOTE(review): built but never used below

glovector <- s.glovec.text.df %>% select(contains("X")) %>% as.matrix

combined.embedding.kcca <- kcca(lsa_model$dk, glovector,
                                kernel = "rbfdot", kpar = list(sigma = 0.1),
                                gamma = 0.1, ncomps = 7)

# Combine the canonical coefficients of BOTH views. The original bound xcoef
# to itself (cbind(xcoef, xcoef)), which is a no-op for cosine similarity;
# xcoef + ycoef was almost certainly intended.
combined.embedding.kcca.mat <- cbind(combined.embedding.kcca@xcoef,
                                     combined.embedding.kcca@ycoef)

doc.similiarity.mat <- cosine(t(combined.embedding.kcca.mat))
row.names(doc.similiarity.mat) <- as.vector(s.glovec.text.df$document)

doc.dissimilarity.dist <- as.dist(1 - doc.similiarity.mat)

## Hierarchical clustering
# Setting method = ward.D2 corresponds to agnes clustering
doc.cluster <- hclust(doc.dissimilarity.dist, method = "ward.D2", members = NULL)
plot(doc.cluster)

# Help from Colleen Wall with this code and the following chunk
# as_tibble() replaces the deprecated as.tibble()
umap_df <- s.glovec.text.df %>%
  select(starts_with("X")) %>%
  umap(n_neighbors = 5, n_components = 2) %>%
  as_tibble(.name_repair = "unique")
names(umap_df) <- c("umap_1", "umap_2")
# Assumes docs.df rows are in the same order as s.glovec.text.df -- TODO confirm
umap_df <- cbind(docs.df, umap_df)

doc.umap.plot1 <-
  ggplot(umap_df, aes(umap_1, umap_2)) +
  geom_point(aes(colour = number), size = 3) +
  geom_label_repel(aes(label = document, fill = number), alpha = .6, size = 3) +
  labs(title = "UMAP clustering", subtitle = "5 nearest neighbors") +
  #theme_void()+
  theme(legend.position = "none")
doc.umap.plot1

## Unlabeled scatterplot of the same UMAP coordinates
ggplot(umap_df, aes(umap_1, umap_2)) +
  geom_point() +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "UMAP_1", y = "UMAP_2", title = "UMAP Scatterplot of Terms")

### The scatterplot shows the relationship between certain weeks' readings more clearly than the dendrogram does. That said, it is still not a perfect correspondence with the weeks.